DATA PREPROCESSING

LOADING DATASET & CHECKING str():

library(readr)
# Load the raw customer data; read.csv() returns a plain data.frame.
credit_card <- read.csv("credit_card.csv")
# str() prints the structure itself and returns NULL invisibly, so it is called
# directly: wrapping it in print() only appended a stray "NULL" line (still
# visible at the end of the rendered output until the document is re-knit).
str(credit_card)
## 'data.frame':    10127 obs. of  23 variables:
##  $ CLIENTNUM                                                                                                                         : int  768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
##  $ Attrition_Flag                                                                                                                    : chr  "Existing Customer" "Existing Customer" "Existing Customer" "Existing Customer" ...
##  $ Customer_Age                                                                                                                      : int  45 49 51 40 40 44 51 32 37 48 ...
##  $ Gender                                                                                                                            : chr  "M" "F" "M" "F" ...
##  $ Dependent_count                                                                                                                   : int  3 5 3 4 3 2 4 0 3 2 ...
##  $ Education_Level                                                                                                                   : chr  "High School" "Graduate" "Graduate" "High School" ...
##  $ Marital_Status                                                                                                                    : chr  "Married" "Single" "Married" "Unknown" ...
##  $ Income_Category                                                                                                                   : chr  "$60K - $80K" "Less than $40K" "$80K - $120K" "Less than $40K" ...
##  $ Card_Category                                                                                                                     : chr  "Blue" "Blue" "Blue" "Blue" ...
##  $ Months_on_book                                                                                                                    : int  39 44 36 34 21 36 46 27 36 36 ...
##  $ Total_Relationship_Count                                                                                                          : int  5 6 4 3 5 3 6 2 5 6 ...
##  $ Months_Inactive_12_mon                                                                                                            : int  1 1 1 4 1 1 1 2 2 3 ...
##  $ Contacts_Count_12_mon                                                                                                             : int  3 2 0 1 0 2 3 2 0 3 ...
##  $ Credit_Limit                                                                                                                      : num  12691 8256 3418 3313 4716 ...
##  $ Total_Revolving_Bal                                                                                                               : int  777 864 0 2517 0 1247 2264 1396 2517 1677 ...
##  $ Avg_Open_To_Buy                                                                                                                   : num  11914 7392 3418 796 4716 ...
##  $ Total_Amt_Chng_Q4_Q1                                                                                                              : num  1.33 1.54 2.59 1.41 2.17 ...
##  $ Total_Trans_Amt                                                                                                                   : int  1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
##  $ Total_Trans_Ct                                                                                                                    : int  42 33 20 20 28 24 31 36 24 32 ...
##  $ Total_Ct_Chng_Q4_Q1                                                                                                               : num  1.62 3.71 2.33 2.33 2.5 ...
##  $ Avg_Utilization_Ratio                                                                                                             : num  0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
##  $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1: num  9.34e-05 5.69e-05 2.11e-05 1.34e-04 2.17e-05 ...
##  $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2: num  1 1 1 1 1 ...
## NULL

DIMENSIONS OF DATA:

# Number of rows and columns of the raw data frame (auto-printed).
dim(credit_card)
## [1] 10127    23

SUMMARY OF DATASET:

# Per-column summary: quartiles and mean for numeric columns, and
# length/class/mode for character columns.
summary(credit_card)
##    CLIENTNUM        Attrition_Flag      Customer_Age     Gender         
##  Min.   :7.08e+08   Length:10127       Min.   :26.0   Length:10127      
##  1st Qu.:7.13e+08   Class :character   1st Qu.:41.0   Class :character  
##  Median :7.18e+08   Mode  :character   Median :46.0   Mode  :character  
##  Mean   :7.39e+08                      Mean   :46.3                     
##  3rd Qu.:7.73e+08                      3rd Qu.:52.0                     
##  Max.   :8.28e+08                      Max.   :73.0                     
##  Dependent_count Education_Level    Marital_Status     Income_Category   
##  Min.   :0.00    Length:10127       Length:10127       Length:10127      
##  1st Qu.:1.00    Class :character   Class :character   Class :character  
##  Median :2.00    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2.35                                                            
##  3rd Qu.:3.00                                                            
##  Max.   :5.00                                                            
##  Card_Category      Months_on_book Total_Relationship_Count
##  Length:10127       Min.   :13.0   Min.   :1.00            
##  Class :character   1st Qu.:31.0   1st Qu.:3.00            
##  Mode  :character   Median :36.0   Median :4.00            
##                     Mean   :35.9   Mean   :3.81            
##                     3rd Qu.:40.0   3rd Qu.:5.00            
##                     Max.   :56.0   Max.   :6.00            
##  Months_Inactive_12_mon Contacts_Count_12_mon  Credit_Limit  
##  Min.   :0.00           Min.   :0.00          Min.   : 1438  
##  1st Qu.:2.00           1st Qu.:2.00          1st Qu.: 2555  
##  Median :2.00           Median :2.00          Median : 4549  
##  Mean   :2.34           Mean   :2.46          Mean   : 8632  
##  3rd Qu.:3.00           3rd Qu.:3.00          3rd Qu.:11068  
##  Max.   :6.00           Max.   :6.00          Max.   :34516  
##  Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
##  Min.   :   0        Min.   :    3   Min.   :0.00         Min.   :  510  
##  1st Qu.: 359        1st Qu.: 1324   1st Qu.:0.63         1st Qu.: 2156  
##  Median :1276        Median : 3474   Median :0.74         Median : 3899  
##  Mean   :1163        Mean   : 7469   Mean   :0.76         Mean   : 4404  
##  3rd Qu.:1784        3rd Qu.: 9859   3rd Qu.:0.86         3rd Qu.: 4741  
##  Max.   :2517        Max.   :34516   Max.   :3.40         Max.   :18484  
##  Total_Trans_Ct  Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
##  Min.   : 10.0   Min.   :0.00        Min.   :0.000        
##  1st Qu.: 45.0   1st Qu.:0.58        1st Qu.:0.023        
##  Median : 67.0   Median :0.70        Median :0.176        
##  Mean   : 64.9   Mean   :0.71        Mean   :0.275        
##  3rd Qu.: 81.0   3rd Qu.:0.82        3rd Qu.:0.503        
##  Max.   :139.0   Max.   :3.71        Max.   :0.999        
##  Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
##  Min.   :0.00                                                                                                                      
##  1st Qu.:0.00                                                                                                                      
##  Median :0.00                                                                                                                      
##  Mean   :0.16                                                                                                                      
##  3rd Qu.:0.00                                                                                                                      
##  Max.   :1.00                                                                                                                      
##  Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
##  Min.   :0.00                                                                                                                      
##  1st Qu.:1.00                                                                                                                      
##  Median :1.00                                                                                                                      
##  Mean   :0.84                                                                                                                      
##  3rd Qu.:1.00                                                                                                                      
##  Max.   :1.00

COLUMN NAMES IN THE DATASET:

# Keep every column name, and remember the trailing two so that they can be
# removed from the data later on.
column_names <- names(credit_card)
last_two_columns <- utils::tail(column_names, n = 2L)
print(column_names)
##  [1] "CLIENTNUM"                                                                                                                         
##  [2] "Attrition_Flag"                                                                                                                    
##  [3] "Customer_Age"                                                                                                                      
##  [4] "Gender"                                                                                                                            
##  [5] "Dependent_count"                                                                                                                   
##  [6] "Education_Level"                                                                                                                   
##  [7] "Marital_Status"                                                                                                                    
##  [8] "Income_Category"                                                                                                                   
##  [9] "Card_Category"                                                                                                                     
## [10] "Months_on_book"                                                                                                                    
## [11] "Total_Relationship_Count"                                                                                                          
## [12] "Months_Inactive_12_mon"                                                                                                            
## [13] "Contacts_Count_12_mon"                                                                                                             
## [14] "Credit_Limit"                                                                                                                      
## [15] "Total_Revolving_Bal"                                                                                                               
## [16] "Avg_Open_To_Buy"                                                                                                                   
## [17] "Total_Amt_Chng_Q4_Q1"                                                                                                              
## [18] "Total_Trans_Amt"                                                                                                                   
## [19] "Total_Trans_Ct"                                                                                                                    
## [20] "Total_Ct_Chng_Q4_Q1"                                                                                                               
## [21] "Avg_Utilization_Ratio"                                                                                                             
## [22] "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"
## [23] "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"

DATA TYPES OF COLUMNS:

# Class of every column as a named character vector. vapply() pins the result
# type (sapply() would silently return a list if any column carried more than
# one class); a multi-class column is collapsed into a single "a/b" label.
column_data_types <- vapply(
  credit_card,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
print(column_data_types)
##                                                                                                                          CLIENTNUM 
##                                                                                                                          "integer" 
##                                                                                                                     Attrition_Flag 
##                                                                                                                        "character" 
##                                                                                                                       Customer_Age 
##                                                                                                                          "integer" 
##                                                                                                                             Gender 
##                                                                                                                        "character" 
##                                                                                                                    Dependent_count 
##                                                                                                                          "integer" 
##                                                                                                                    Education_Level 
##                                                                                                                        "character" 
##                                                                                                                     Marital_Status 
##                                                                                                                        "character" 
##                                                                                                                    Income_Category 
##                                                                                                                        "character" 
##                                                                                                                      Card_Category 
##                                                                                                                        "character" 
##                                                                                                                     Months_on_book 
##                                                                                                                          "integer" 
##                                                                                                           Total_Relationship_Count 
##                                                                                                                          "integer" 
##                                                                                                             Months_Inactive_12_mon 
##                                                                                                                          "integer" 
##                                                                                                              Contacts_Count_12_mon 
##                                                                                                                          "integer" 
##                                                                                                                       Credit_Limit 
##                                                                                                                          "numeric" 
##                                                                                                                Total_Revolving_Bal 
##                                                                                                                          "integer" 
##                                                                                                                    Avg_Open_To_Buy 
##                                                                                                                          "numeric" 
##                                                                                                               Total_Amt_Chng_Q4_Q1 
##                                                                                                                          "numeric" 
##                                                                                                                    Total_Trans_Amt 
##                                                                                                                          "integer" 
##                                                                                                                     Total_Trans_Ct 
##                                                                                                                          "integer" 
##                                                                                                                Total_Ct_Chng_Q4_Q1 
##                                                                                                                          "numeric" 
##                                                                                                              Avg_Utilization_Ratio 
##                                                                                                                          "numeric" 
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1 
##                                                                                                                          "numeric" 
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2 
##                                                                                                                          "numeric"

AFTER DROPPING UNNECESSARY COLUMNS:

# Drop the two trailing columns identified earlier and keep everything else.
is_kept <- !(names(credit_card) %in% last_two_columns)
credit_card_final <- credit_card[is_kept]
print(names(credit_card_final))
##  [1] "CLIENTNUM"                "Attrition_Flag"          
##  [3] "Customer_Age"             "Gender"                  
##  [5] "Dependent_count"          "Education_Level"         
##  [7] "Marital_Status"           "Income_Category"         
##  [9] "Card_Category"            "Months_on_book"          
## [11] "Total_Relationship_Count" "Months_Inactive_12_mon"  
## [13] "Contacts_Count_12_mon"    "Credit_Limit"            
## [15] "Total_Revolving_Bal"      "Avg_Open_To_Buy"         
## [17] "Total_Amt_Chng_Q4_Q1"     "Total_Trans_Amt"         
## [19] "Total_Trans_Ct"           "Total_Ct_Chng_Q4_Q1"     
## [21] "Avg_Utilization_Ratio"

CHECKING FOR MISSING (NA) VALUES:

# Count the missing (NA) entries in each column.
null_counts <- vapply(
  credit_card_final,
  function(column) sum(is.na(column)),
  numeric(1)
)
print(null_counts)
##                CLIENTNUM           Attrition_Flag             Customer_Age 
##                        0                        0                        0 
##                   Gender          Dependent_count          Education_Level 
##                        0                        0                        0 
##           Marital_Status          Income_Category            Card_Category 
##                        0                        0                        0 
##           Months_on_book Total_Relationship_Count   Months_Inactive_12_mon 
##                        0                        0                        0 
##    Contacts_Count_12_mon             Credit_Limit      Total_Revolving_Bal 
##                        0                        0                        0 
##          Avg_Open_To_Buy     Total_Amt_Chng_Q4_Q1          Total_Trans_Amt 
##                        0                        0                        0 
##           Total_Trans_Ct      Total_Ct_Chng_Q4_Q1    Avg_Utilization_Ratio 
##                        0                        0                        0

REPLACING BLANK VALUES WITH NA AND RE-CHECKING FOR MISSING (NA) VALUES:

# Treat blank text cells as missing. Only text (character/factor) columns can
# hold blanks, and a cell counts as blank when it is empty or whitespace-only --
# not just when it is exactly one space -- so "" and "  " are caught as well.
is_text_column <- vapply(
  credit_card_final,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
if (any(is_text_column)) {
  credit_card_final[is_text_column] <- lapply(
    credit_card_final[is_text_column],
    function(column) {
      is_blank <- !is.na(column) & trimws(as.character(column)) == ""
      column[is_blank] <- NA
      column
    }
  )
}

# Re-count missing values now that blanks are NA. The rendered counts shown
# below were produced when only " " was matched; re-knit to refresh them.
null_counts1 <- colSums(is.na(credit_card_final))
print(null_counts1)
##                CLIENTNUM           Attrition_Flag             Customer_Age 
##                        0                        0                        0 
##                   Gender          Dependent_count          Education_Level 
##                        0                        0                        0 
##           Marital_Status          Income_Category            Card_Category 
##                        0                        0                        0 
##           Months_on_book Total_Relationship_Count   Months_Inactive_12_mon 
##                        0                        0                        0 
##    Contacts_Count_12_mon             Credit_Limit      Total_Revolving_Bal 
##                        0                        0                        0 
##          Avg_Open_To_Buy     Total_Amt_Chng_Q4_Q1          Total_Trans_Amt 
##                        0                        0                        0 
##           Total_Trans_Ct      Total_Ct_Chng_Q4_Q1    Avg_Utilization_Ratio 
##                        0                        0                        0

CATEGORICAL VARIABLES:

library(dplyr)

# Categorical variables are the columns stored as text (character) or factor.
is_categorical <- vapply(
  credit_card_final,
  function(column) is.character(column) || is.factor(column),
  logical(1)
)
categorical_variables <- names(credit_card_final)[is_categorical]

print(categorical_variables)
## [1] "Attrition_Flag"  "Gender"          "Education_Level" "Marital_Status" 
## [5] "Income_Category" "Card_Category"

NUMERICAL VARIABLES:

library(dplyr)

# Names of every numeric (integer or double) column.
numerical_variables <- names(credit_card_final)[
  vapply(credit_card_final, is.numeric, logical(1))
]

# A numeric column is treated as discrete when all of its values are whole
# numbers (within all.equal()'s tolerance) and as continuous otherwise.
is_whole_valued <- vapply(
  credit_card_final[numerical_variables],
  function(col) isTRUE(all.equal(col, round(col))),
  logical(1)
)

# Split the names with that flag. Previously `continuous_variables` held the
# one-row table of flags itself, so setdiff() against it removed nothing and
# every numeric column was reported as discrete.
discrete_variables <- numerical_variables[is_whole_valued]
continuous_variables <- numerical_variables[!is_whole_valued]

# Combine continuous and discrete variables into a single output
variables_summary <- list(Continuous = continuous_variables, Discrete = discrete_variables)

# Print the variables summary
print(discrete_variables)
##  [1] "CLIENTNUM"                "Customer_Age"            
##  [3] "Dependent_count"          "Months_on_book"          
##  [5] "Total_Relationship_Count" "Months_Inactive_12_mon"  
##  [7] "Contacts_Count_12_mon"    "Total_Revolving_Bal"     
##  [9] "Total_Trans_Amt"          "Total_Trans_Ct"
print(continuous_variables)
## [1] "Credit_Limit"          "Avg_Open_To_Buy"       "Total_Amt_Chng_Q4_Q1" 
## [4] "Total_Ct_Chng_Q4_Q1"   "Avg_Utilization_Ratio"

EDA –> “Exploratory Data Analysis”

DATA DISTRIBUTION OF CATEGORICAL VARIABLES:

library(ggplot2)

categorical_vars_ggplot <- c("Attrition_Flag", "Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category")

# One labelled bar chart per categorical column. The column name arrives as a
# string, so it is looked up through the `.data` pronoun instead of the
# deprecated aes_string(), and the computed count is referenced with
# after_stat() instead of the deprecated `..count..` notation.
for (cat_var in categorical_vars_ggplot) {
  plot_obj <- ggplot(credit_card_final, aes(x = .data[[cat_var]], fill = .data[[cat_var]])) +
    geom_bar() +
    # Write each bar's count just above the bar.
    geom_text(stat = "count", aes(label = after_stat(count), y = after_stat(count)), vjust = -0.5) +
    labs(title = paste("Distribution of", cat_var), x = cat_var, y = "Count") +
    scale_fill_brewer(palette = "Set3") +
    theme_minimal() +
    theme(legend.position = "none")

  # Plots built inside a loop are not auto-printed, hence the explicit print().
  print(plot_obj)
}

# ggplot2 provides the plotting functions used below
library(ggplot2)

# Column chart of Credit_Limit per Card_Category, coloured by category. Every
# row contributes its own segment, and segments that share a category are
# stacked on top of each other.
ggplot(
  data = credit_card_final,
  mapping = aes(x = Card_Category, y = Credit_Limit, fill = Card_Category)
) +
  geom_col() +
  labs(
    title = "Credit Limit by Card Category",
    x = "Card Category",
    y = "Credit Limit"
  ) +
  theme_minimal()

# Load necessary libraries
#library(ggplot2)

# Create a bar graph with facets
#ggplot(credit_card_final, aes(x = Gender, y = Avg_Utilization_Ratio, fill = Gender)) +
 # geom_bar(stat = "identity") +
  #labs(title = "Avg Utilization Ratio and Avg Open To Buy by Gender",
    #   x = "Gender",
   #    y = "Avg Utilization Ratio") +
  #facet_wrap(~ Avg_Open_To_Buy, ncol = 3) +
  #theme_minimal()

CUSTOMER SEGMENTATION:

# Load necessary libraries
library(dplyr)
library(ggplot2)

# Select relevant attributes for segmentation
selected_attributes <- credit_card_final %>%
  select(Customer_Age, Income_Category, Card_Category)

# Income_Category is stored as text (e.g. "$60K - $80K"), so it can neither be
# compared with a number nor averaged: the former `Income_Category >= 4` was a
# string comparison and mean(Income_Category) was always NA. Rank the brackets
# from lowest (1) to highest (5) and work with that rank instead.
# NOTE(review): the bracket labels below are assumed from the values visible in
# the data preview and the usual labels of this dataset -- confirm with
# unique(credit_card_final$Income_Category).
income_levels <- c(
  "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"
)
income_rank <- match(credit_card_final$Income_Category, income_levels)

# An "Unknown" bracket (the placeholder also seen in Marital_Status) is left
# unranked on purpose; any other unmatched label means the list above is wrong.
unrecognised_income <- setdiff(
  unique(credit_card_final$Income_Category[is.na(income_rank)]),
  c("Unknown", NA)
)
if (length(unrecognised_income) > 0) {
  warning(
    "Unrecognised Income_Category value(s): ",
    paste(unrecognised_income, collapse = ", "),
    call. = FALSE
  )
}

# Define the segmentation criteria (you can adjust these criteria as needed).
# "High income" means rank 4 or above, i.e. the two top brackets.
high_income_rank <- 4
is_high_income <- income_rank >= high_income_rank
customer_age <- credit_card_final$Customer_Age
is_young <- customer_age < 35
is_middle_age <- customer_age >= 35 & customer_age <= 60
is_senior <- customer_age > 60

young_high_income <- is_young & is_high_income
middle_age_high_income <- is_middle_age & is_high_income
senior_high_income <- is_senior & is_high_income
young_low_income <- is_young & !is_high_income
middle_age_low_income <- is_middle_age & !is_high_income
senior_low_income <- is_senior & !is_high_income

# Assign segments to the original dataset. Customers whose income could not be
# ranked match no criterion and get an NA segment.
credit_card_final$Segment <- case_when(
  young_high_income ~ "Young High Income",
  middle_age_high_income ~ "Middle Age High Income",
  senior_high_income ~ "Senior High Income",
  young_low_income ~ "Young Low Income",
  middle_age_low_income ~ "Middle Age Low Income",
  senior_low_income ~ "Senior Low Income",
  TRUE ~ NA_character_
)

# Visualize the segmentation
ggplot(credit_card_final, aes(x = Customer_Age, y = Income_Category, color = Segment)) +
  geom_point() +
  labs(title = "Customer Segmentation by Age and Income Category")

# Summary statistics for each segment. Average_Income is the mean income rank
# (1 = lowest bracket, 5 = highest), ignoring unranked customers.
segment_summary <- credit_card_final %>%
  mutate(Income_Rank = income_rank) %>%
  group_by(Segment) %>%
  summarise(
    Average_Age = mean(Customer_Age),
    Average_Income = mean(Income_Rank, na.rm = TRUE)
  )
# NOTE: the rendered table below predates the income-rank fix (its segments
# came from the string comparison); re-knit the document to refresh it.
print(segment_summary)
## # A tibble: 6 × 3
##   Segment                Average_Age Average_Income
##   <chr>                        <dbl>          <dbl>
## 1 Middle Age High Income        46.9             NA
## 2 Middle Age Low Income         46.8             NA
## 3 Senior High Income            63.4             NA
## 4 Senior Low Income             62.5             NA
## 5 Young High Income             30.6             NA
## 6 Young Low Income              31.5             NA
# Segment customers based on Card Category: "<category> Card" for the four
# known categories, NA for anything else.
known_card_categories <- c("Blue", "Silver", "Gold", "Platinum")
credit_card_final$Card_Segment <- ifelse(
  credit_card_final$Card_Category %in% known_card_categories,
  paste(credit_card_final$Card_Category, "Card"),
  NA_character_
)

# Visualize the Card Category segmentation
ggplot(credit_card_final, aes(x = Customer_Age, y = Income_Category, color = Card_Segment)) +
  geom_point() +
  labs(title = "Customer Segmentation by Age and Income Category (Card Category)")

# Income_Category is text, so mean(Income_Category) was always NA. Average the
# bracket's rank instead (1 = lowest bracket, 5 = highest).
# NOTE(review): bracket labels are assumed from the values visible in the data
# preview and the usual labels of this dataset -- confirm with
# unique(credit_card_final$Income_Category). Labels that do not match (for
# example an "Unknown" placeholder) get no rank and are ignored by the mean.
income_levels <- c(
  "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"
)

# Summary statistics for each Card Category segment
card_category_summary <- credit_card_final %>%
  mutate(Income_Rank = match(Income_Category, income_levels)) %>%
  group_by(Card_Segment) %>%
  summarise(
    Average_Age = mean(Customer_Age),
    Average_Income = mean(Income_Rank, na.rm = TRUE)
  )
# NOTE: the Average_Income column in the rendered table below still shows the
# old NA values; re-knit the document to refresh it.
print(card_category_summary)
## # A tibble: 4 × 3
##   Card_Segment  Average_Age Average_Income
##   <chr>               <dbl>          <dbl>
## 1 Blue Card            46.4             NA
## 2 Gold Card            45.4             NA
## 3 Platinum Card        47.5             NA
## 4 Silver Card          45.7             NA

DATA DISTRIBUTION OF NUMERICAL VARIABLES:

library(ggplot2)

# Define a list of column names and corresponding bin widths
column_binwidths <- list(
  "Customer_Age" = 3,
  "Total_Trans_Amt" = 500,
  "Total_Trans_Ct" = 7,
  "Credit_Limit" = 800
)

# Loop through the columns and create histograms with respective bin widths and density curves
for (column in names(column_binwidths)) {
  # Get the bin width for the current column
  binwidth <- column_binwidths[[column]]

  # Histogram on the density scale so that the density curve shares its y axis.
  # after_stat(density) replaces the deprecated `..density..` notation.
  hist_plot <- ggplot(credit_card_final, aes(x = .data[[column]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      binwidth = binwidth, fill = "pink", color = "black"
    ) +
    geom_density(alpha = 0.5, color = "green") +
    labs(title = paste("Histogram with Density Curve of", column)) +
    theme_minimal()

  # Display the histogram with density curve (explicit print() is needed inside a loop)
  print(hist_plot)
}

CORRELATION ANALYSIS:

#install.packages("corrplot")
library(corrplot)

# Pairwise correlations between the numeric columns
correlation_matrix <- cor(x = credit_card_final[, numerical_variables])

# Draw the upper triangle of the matrix as a colour-coded grid
corrplot(
  correlation_matrix,
  type = "upper",
  method = "color",
  tl.srt = 50,
  tl.col = "black"
)

# Find highly correlated pairs.
# correlation_matrix is reused from the correlation plot step just above;
# recomputing it here was redundant.
threshold <- 0.7  # Set your desired correlation threshold

# Upper-triangle cells whose absolute correlation reaches the threshold.
# which() skips NA cells, so an undefined correlation no longer aborts the
# search, and no 1:(n - 1) index sequence is needed.
high_corr_cells <- which(
  abs(correlation_matrix) >= threshold & upper.tri(correlation_matrix),
  arr.ind = TRUE
)

# which() scans column by column; reorder by row, then column, so the pairs
# come out in the same order as a nested i/j loop would produce them.
high_corr_cells <- high_corr_cells[
  order(high_corr_cells[, 1], high_corr_cells[, 2]), ,
  drop = FALSE
]

# Two-column character matrix: one row per related pair (zero rows if none)
related_pairs <- cbind(
  unname(numerical_variables[high_corr_cells[, 1]]),
  unname(numerical_variables[high_corr_cells[, 2]])
)

# Display related variable pairs
print(related_pairs)
##      [,1]              [,2]             
## [1,] "Customer_Age"    "Months_on_book" 
## [2,] "Credit_Limit"    "Avg_Open_To_Buy"
## [3,] "Total_Trans_Amt" "Total_Trans_Ct"

CORRELATION COEFFICIENTS:

# Define the pairs of variables
variable_pairs <- list(
  c("Customer_Age", "Months_on_book"),
  c("Credit_Limit", "Avg_Open_To_Buy"),
  c("Total_Trans_Amt", "Total_Trans_Ct")
)

#' Print the correlation between two columns of a data frame
#'
#' @param pair Character vector whose first two elements name the columns.
#' @param data Data frame holding both columns. Defaults to the global
#'   `credit_card_final`; the default is only evaluated when `data` is not
#'   supplied, so the function can be used with any data frame.
#' @return The correlation coefficient, invisibly (the line is also printed).
calculate_and_display_correlation <- function(pair, data = credit_card_final) {
  stopifnot(is.character(pair), length(pair) >= 2L)
  variable1 <- pair[1]
  variable2 <- pair[2]

  # Fail with a clear message instead of a generic subsetting error
  missing_columns <- setdiff(c(variable1, variable2), names(data))
  if (length(missing_columns) > 0) {
    stop(
      "Column(s) not found in data: ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }

  # Calculate correlation
  correlation_coefficient <- cor(data[[variable1]], data[[variable2]])

  # Print the correlation result
  cat("Correlation between", variable1, "and", variable2, "is", correlation_coefficient, "\n")

  invisible(correlation_coefficient)
}

# Report the correlation for each pair, in list order
for (pair_index in seq_along(variable_pairs)) {
  calculate_and_display_correlation(variable_pairs[[pair_index]])
}
## Correlation between Customer_Age and Months_on_book is 0.789 
## Correlation between Credit_Limit and Avg_Open_To_Buy is 0.996 
## Correlation between Total_Trans_Amt and Total_Trans_Ct is 0.807

PLOTTING THE PAIRS:

# Load necessary libraries
library(ggplot2)

# Related variable pairs, each stored as a two-element list of column names
related_pairs <- lapply(
  list(
    c("Customer_Age", "Months_on_book"),
    c("Credit_Limit", "Avg_Open_To_Buy"),
    c("Total_Trans_Amt", "Total_Trans_Ct")
  ),
  as.list
)

# Draw a scatter plot of credit_card_final for one pair of columns, with a
# straight-line (lm) fit on top. `pair` supplies the x column name first and
# the y column name second.
plot_scatter_with_regression <- function(pair) {
  var1 <- pair[[1]]
  var2 <- pair[[2]]
  plot_title <- paste("Scatter Plot of", var1, "vs", var2)

  # Points first, then the fitted line without a confidence band
  scatter_plot <- ggplot(credit_card_final, aes(x = .data[[var1]], y = .data[[var2]])) +
    geom_point(color = "orchid", size = 3, alpha = 0.6) +
    geom_smooth(method = "lm", color = "dodgerblue", size = 1, se = FALSE) +
    ggtitle(plot_title) +
    xlab(var1) +
    ylab(var2) +
    theme_minimal()

  # Plot size options are set globally before the plot is rendered
  options(repr.plot.width = 6, repr.plot.height = 4)

  print(scatter_plot)
}

# One scatter plot per related pair
for (pair_index in seq_along(related_pairs)) {
  plot_scatter_with_regression(related_pairs[[pair_index]])
}

# Load necessary libraries
library(ggplot2)
library(gridExtra)

# Define the pairs of variables
variable_pairs <- list(
  c("Customer_Age", "Months_on_book"),
  c("Credit_Limit", "Avg_Open_To_Buy"),
  c("Total_Trans_Amt", "Total_Trans_Ct")
)

#' Plot histograms for both variables of a pair plus a QQ-plot of the first
#'
#' @param pair Character vector whose first two elements name the columns.
#' @param data Data frame to plot from. Defaults to the global
#'   `credit_card_final`; only evaluated when `data` is not supplied.
#' @param bins Number of histogram bins. This replaces the previous fixed bin
#'   width of 10, which is far too narrow for columns such as Credit_Limit or
#'   Total_Trans_Amt whose values span tens of thousands.
#' @return Whatever `grid.arrange()` returns; called for the drawing it makes.
plot_histogram_qqpair <- function(pair, data = credit_card_final, bins = 30) {
  variable1 <- pair[1]
  variable2 <- pair[2]

  # Extract data for the pair
  data_pair <- data[, c(variable1, variable2)]

  # Create a histogram for variable 1
  hist_var1 <- ggplot(data_pair, aes(x = .data[[variable1]])) +
    geom_histogram(bins = bins, color = "black", fill = "darkgoldenrod") +
    labs(title = paste("Histogram of", variable1)) +
    theme_minimal()

  # Create a histogram for variable 2
  hist_var2 <- ggplot(data_pair, aes(x = .data[[variable2]])) +
    geom_histogram(bins = bins, color = "black", fill = "tomato") +
    labs(title = paste("Histogram of", variable2)) +
    theme_minimal()

  # QQ-plot of variable 1 only (named qq_plot to avoid masking stats::qqplot)
  qq_plot <- ggplot(data_pair, aes(sample = .data[[variable1]])) +
    geom_qq() +
    geom_qq_line(color = "darkolivegreen") +
    labs(title = paste("QQ-Plot of", variable1, "vs Normal Distribution")) +
    theme_minimal()

  # Arrange histograms and QQ-plot in one grid
  grid.arrange(hist_var1, hist_var2, qq_plot, ncol = 2)
}

# Loop through variable pairs and plot histograms and QQ-plots
for (pair in variable_pairs) {
  plot_histogram_qqpair(pair)
}

OUTLIER DETECTION AND REMOVAL:

# Create a copy of the dataset to avoid modifying the original data
credit_card_no_outliers <- credit_card_final

# NOTE(review): with rm = TRUE, ezids::outlierKD2() is documented to return the
# data frame with the flagged values of the named column replaced by NA -
# confirm against the installed ezids version. The earlier pattern
#   credit_card_no_outliers[[col]][df_outliers$outliers] <- NA
# indexed with a non-existent `outliers` element and therefore changed
# nothing, so the "no outliers" copy stayed identical to the original data.
# Each result is now assigned back so the NA values are actually kept.
# Downstream aggregates on these columns need na.rm = TRUE.

# Outlier identification and removal from Customer Age
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Customer_Age, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 2 
## Proportion (%) of outliers: 0 
## Mean of the outliers: 71.5 
## Mean without removing outliers: 46.3 
## Mean if we remove outliers: 46.3 
## Outliers successfully removed

# Outlier identification and removal from Credit Limit
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Credit_Limit, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 984 
## Proportion (%) of outliers: 10.8 
## Mean of the outliers: 31551 
## Mean without removing outliers: 8632 
## Mean if we remove outliers: 6165 
## Outliers successfully removed

# Outlier identification and removal from Months on Book
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Months_on_book, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 386 
## Proportion (%) of outliers: 4 
## Mean of the outliers: 35.5 
## Mean without removing outliers: 35.9 
## Mean if we remove outliers: 35.9 
## Outliers successfully removed

# Outlier identification and removal from Total Revolving Balance
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Total_Revolving_Bal, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 0 
## Proportion (%) of outliers: 0 
## Mean of the outliers: NaN 
## Mean without removing outliers: 1163 
## Mean if we remove outliers: 1163 
## Outliers successfully removed

# Outlier identification and removal from Total Transaction Amount
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Total_Trans_Amt, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 896 
## Proportion (%) of outliers: 9.7 
## Mean of the outliers: 13770 
## Mean without removing outliers: 4404 
## Mean if we remove outliers: 3495 
## Outliers successfully removed

# Outlier identification and removal from Total Transaction Ct
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Total_Trans_Ct, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 2 
## Proportion (%) of outliers: 0 
## Mean of the outliers: 138 
## Mean without removing outliers: 64.9 
## Mean if we remove outliers: 64.8 
## Outliers successfully removed

# Outlier identification and removal from Average Open to Buy
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Avg_Open_To_Buy, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 963 
## Proportion (%) of outliers: 10.5 
## Mean of the outliers: 30532 
## Mean without removing outliers: 7469 
## Mean if we remove outliers: 5046 
## Outliers successfully removed

# Outlier identification and removal from Average Utilization Ratio
credit_card_no_outliers <- ezids::outlierKD2(credit_card_no_outliers, Avg_Utilization_Ratio, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 0 
## Proportion (%) of outliers: 0 
## Mean of the outliers: NaN 
## Mean without removing outliers: 0.27 
## Mean if we remove outliers: 0.27 
## Outliers successfully removed
library(ggplot2)
library(gridExtra)

# Define the pairs of variables
variable_pairs <- list(
  c("Customer_Age", "Months_on_book"),
  c("Credit_Limit", "Avg_Open_To_Buy"),
  c("Total_Trans_Amt", "Total_Trans_Ct")
)

#' Plot histograms for both variables of a pair plus a QQ-plot of the first
#'
#' @param pair Character vector whose first two elements name the columns.
#' @param data Data frame to plot from. Defaults to the global
#'   `credit_card_no_outliers`; only evaluated when `data` is not supplied.
#' @param bins Number of histogram bins. This replaces the previous fixed bin
#'   width of 10, which is far too narrow for columns such as Credit_Limit or
#'   Total_Trans_Amt whose values span tens of thousands.
#' @return Whatever `grid.arrange()` returns; called for the drawing it makes.
plot_histogram_qqpair <- function(pair, data = credit_card_no_outliers, bins = 30) {
  variable1 <- pair[1]
  variable2 <- pair[2]

  # Extract data for the pair
  data_pair <- data[, c(variable1, variable2)]

  # Create a histogram for variable 1
  hist_var1 <- ggplot(data_pair, aes(x = .data[[variable1]])) +
    geom_histogram(bins = bins, color = "black", fill = "darkgoldenrod") +
    labs(title = paste("Histogram of", variable1)) +
    theme_minimal()

  # Create a histogram for variable 2
  hist_var2 <- ggplot(data_pair, aes(x = .data[[variable2]])) +
    geom_histogram(bins = bins, color = "black", fill = "tomato") +
    labs(title = paste("Histogram of", variable2)) +
    theme_minimal()

  # QQ-plot of variable 1 only (named qq_plot to avoid masking stats::qqplot)
  qq_plot <- ggplot(data_pair, aes(sample = .data[[variable1]])) +
    geom_qq() +
    geom_qq_line(color = "darkolivegreen") +
    labs(title = paste("QQ-Plot of", variable1, "vs Normal Distribution")) +
    theme_minimal()

  # Arrange histograms and QQ-plot in one grid
  grid.arrange(hist_var1, hist_var2, qq_plot, ncol = 2)
}

# Loop through variable pairs and plot histograms and QQ-plots
for (pair in variable_pairs) {
  plot_histogram_qqpair(pair)
}

# Calculate the standard deviation for each specified column.
# na.rm = TRUE is required because sd() returns NA as soon as a column holds a
# single missing value, and outlier removal marks outliers as NA.
std_dev_Customer_Age <- sd(credit_card_no_outliers$Customer_Age, na.rm = TRUE)
std_dev_Credit_Limit <- sd(credit_card_no_outliers$Credit_Limit, na.rm = TRUE)
std_dev_Months_on_book <- sd(credit_card_no_outliers$Months_on_book, na.rm = TRUE)
std_dev_Avg_Open_To_Buy <- sd(credit_card_no_outliers$Avg_Open_To_Buy, na.rm = TRUE)
std_dev_Avg_Utilization_Ratio <- sd(credit_card_no_outliers$Avg_Utilization_Ratio, na.rm = TRUE)
std_dev_Total_Revolving_Bal <- sd(credit_card_no_outliers$Total_Revolving_Bal, na.rm = TRUE)
std_dev_Total_Trans_Amt <- sd(credit_card_no_outliers$Total_Trans_Amt, na.rm = TRUE)
std_dev_Total_Trans_Ct <- sd(credit_card_no_outliers$Total_Trans_Ct, na.rm = TRUE)

# Create a data frame with column names and standard deviations
# (both vectors list the columns in the same order)
std_dev_table <- data.frame(
  Column = c("Customer_Age", "Credit_Limit", "Months_on_book", "Avg_Open_To_Buy",
             "Avg_Utilization_Ratio", "Total_Revolving_Bal", "Total_Trans_Amt", "Total_Trans_Ct"),
  Standard_Deviation = c(std_dev_Customer_Age, std_dev_Credit_Limit, std_dev_Months_on_book, std_dev_Avg_Open_To_Buy,
                         std_dev_Avg_Utilization_Ratio, std_dev_Total_Revolving_Bal, std_dev_Total_Trans_Amt, std_dev_Total_Trans_Ct)
)

# Load the knitr package
library(knitr)

# Render the table with knitr::kable(). align, col.names and caption are
# kable() arguments; print() on a data frame does not apply them, which is why
# the column header stayed "Standard_Deviation" and no caption was shown.
# Re-knit the document to refresh the rendered table.
knitr::kable(
  std_dev_table,
  align = "c",
  col.names = c("Column", "Standard Deviation"),
  caption = "Standard Deviations"
)
##                  Column Standard_Deviation
## 1          Customer_Age              8.017
## 2          Credit_Limit           9088.777
## 3        Months_on_book              7.986
## 4       Avg_Open_To_Buy           9090.685
## 5 Avg_Utilization_Ratio              0.276
## 6   Total_Revolving_Bal            814.987
## 7       Total_Trans_Amt           3397.129
## 8        Total_Trans_Ct             23.473
# Two-sample t-test of Total_Trans_Amt between the Gender groups, using the
# formula interface (response ~ grouping variable). The columns are referenced
# through credit_card_no_outliers$..., and that exact text is what appears in
# the "data:" line of the printed result.
t_test_result <- t.test(credit_card_no_outliers$Total_Trans_Amt ~ credit_card_no_outliers$Gender)

# Print a heading line, then the full test summary
print("T-test for Total_Trans_Amt and Gender")
## [1] "T-test for Total_Trans_Amt and Gender"
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  credit_card_no_outliers$Total_Trans_Amt by credit_card_no_outliers$Gender
## t = -2, df = 8914, p-value = 0.01
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -303.9  -34.9
## sample estimates:
## mean in group F mean in group M 
##            4324            4494
# Create a contingency table of income category and churn status.
# (Previously Income_Category was crossed with itself and the table was passed
# to t.test(), which produced a meaningless one-sample t-test of cell counts.)
table_data <- table(credit_card_no_outliers$Income_Category, credit_card_no_outliers$Attrition_Flag)

# Perform chi-square test of independence between the two factors
chi_sq_test <- chisq.test(table_data)

# Print the test results
chi_sq_test
## 
##  Pearson's Chi-squared test
## 
## data:  table_data
## X-squared = 13, df = 5, p-value = 0.03
# Plot 1: average utilization ratio against credit limit
ggplot(
  data = credit_card_final,
  mapping = aes(x = Credit_Limit, y = Avg_Utilization_Ratio)
) +
  geom_point(shape = 4, colour = "blue", alpha = 0.5) +
  labs(
    title = "Example Plot",
    x = "Credit Limit",
    y = "Average Utilization Ratio"
  ) +
  theme_minimal()

# Plot 2: credit limit against age, coloured by income category.
# Rows whose income category is "Unknown" are left out of the plot data.
ggplot(
  data = credit_card_final[credit_card_final$Income_Category != "Unknown", ],
  mapping = aes(x = Customer_Age, y = Credit_Limit, colour = Income_Category)
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Relation Between Credit Limit Vs Age Coloured by Income_Category",
    x = "Age",
    y = "Credit Limit"
  ) +
  scale_colour_viridis_d() +
  theme_minimal() +
  theme(legend.position = "right")

# Compare Contacts_Count_12_mon between attrited and existing customers.
# The two group vectors keep their names because t.test() echoes them in the
# "data:" line of its printed result.
attrition_group <- with(
  credit_card_final,
  Contacts_Count_12_mon[Attrition_Flag == "Attrited Customer"]
)
non_attrition_group <- with(
  credit_card_final,
  Contacts_Count_12_mon[Attrition_Flag == "Existing Customer"]
)

t_test_result <- t.test(attrition_group, non_attrition_group)

# Show the test summary
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  attrition_group and non_attrition_group
## t = 21, df = 2280, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.558 0.674
## sample estimates:
## mean of x mean of y 
##      2.97      2.36
# Counts of income category by attrition status, and a chi-squared test on them
contingency_table_income <- table(
  credit_card_no_outliers$Income_Category,
  credit_card_no_outliers$Attrition_Flag
)

ChiS2_test_income <- chisq.test(contingency_table_income)
print(contingency_table_income)
##                 
##                  Attrited Customer Existing Customer
##   $120K +                      126               601
##   $40K - $60K                  271              1519
##   $60K - $80K                  189              1213
##   $80K - $120K                 242              1293
##   Less than $40K               612              2949
##   Unknown                      187               925
print(ChiS2_test_income)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table_income
## X-squared = 13, df = 5, p-value = 0.03
# Percentage of attrited customers within each income category
attrited_count <- contingency_table_income[, "Attrited Customer"]
existing_count <- contingency_table_income[, "Existing Customer"]
Attrition_rate <- (attrited_count / (attrited_count + existing_count)) * 100
print(Attrition_rate)
##        $120K +    $40K - $60K    $60K - $80K   $80K - $120K Less than $40K 
##           17.3           15.1           13.5           15.8           17.2 
##        Unknown 
##           16.8
# load libraries
library(dplyr)
library(ggplot2)

# group data: mean and median transaction amount (NA ignored) and row count
# per income category
grouped_df <- credit_card_no_outliers %>%
  group_by(Income_Category) %>%
  summarise(
    Transaction_mean = mean(Total_Trans_Amt, na.rm = TRUE),
    Transaction_median = median(Total_Trans_Amt, na.rm = TRUE),
    n = n()
  )

# Stack mean and median into one long table. position_dodge() only separates
# bars that belong to the same layer, so drawing the two statistics as two
# geom_col() layers put the median bars directly on top of the mean bars.
transaction_long <- data.frame(
  Income_Category = rep(grouped_df$Income_Category, times = 2),
  Statistic = rep(c("Mean", "Median"), each = nrow(grouped_df)),
  Value = c(grouped_df$Transaction_mean, grouped_df$Transaction_median)
)

# plotting: one layer, bars dodged side by side within each income category
ggplot(transaction_long, aes(x = Income_Category, y = Value, fill = Statistic)) +
  geom_col(width = 0.4, position = position_dodge(width = 0.5)) +
  labs(fill = "Transaction", y = "Transaction Value", title = "Transaction Behaviour by Income Category") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Compare Avg_Utilization_Ratio between male ("M") and female ("F") customers.
# The vector names are kept because t.test() prints them in its "data:" line.
male_utilization <- with(
  credit_card_no_outliers,
  Avg_Utilization_Ratio[Gender == "M"]
)
female_utilization <- with(
  credit_card_no_outliers,
  Avg_Utilization_Ratio[Gender == "F"]
)

tt_result <- t.test(male_utilization, female_utilization)

# Show the test summary
print(tt_result)
## 
##  Welch Two Sample t-test
## 
## data:  male_utilization and female_utilization
## t = -27, df = 9994, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.153 -0.132
## sample estimates:
## mean of x mean of y 
##     0.200     0.342